# Print large numbers in full instead of switching to scientific notation.
options(scipen = 99999)

# Master table of wallets; "NA" and "-" are both read as missing values.
bmp <- read.csv(
  "brainmaster.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("NA", "-")
)
bmp$firstday <- as.Date(bmp$firstseen)
bmp$firstm <- cut(bmp$firstday, breaks = "month")

# Per-day exchange-rate table, left-joined on the day each wallet was first
# seen so that wallets without a matching trade day are kept.
bsx <- read.csv("bitstampex.csv", header = TRUE, sep = ",")
bsx$tradeday <- as.Date(bsx$tradeday)
bmp <- merge(
  bmp,
  bsx,
  by.x = "firstday",
  by.y = "tradeday",
  all.x = TRUE
)

# USD value of each wallet: amount scaled down by 1e8 (presumably satoshi to
# BTC -- confirm against the data) times the day's rate; missing values -> 0.
bmp$totalusd <- bmp$btcusd * bmp$totalamt / 100000000
bmp$totalusd[is.na(bmp$totalusd)] <- 0

# Keep only rows whose `researcher` flag is FALSE.
bmnr <- bmp[bmp$researcher == FALSE, ]
bmnr$numdrains[is.na(bmnr$numdrains)] <- 0

# Outgoing transactions; the file has no header row, so columns are named here.
txo <- read.csv(
  "transout.csv",
  header = FALSE,
  na.strings = c("NA", "-")
)
names(txo) <- c("addr", "txid", "outaddr", "amt", "date")
txo$outday <- as.Date(txo$date)

# Attach the exchange rate of the transaction day and convert to USD.
txo <- merge(
  txo,
  bsx,
  by.x = "outday",
  by.y = "tradeday",
  all.x = TRUE
)
txo$outusd <- txo$btcusd * txo$amt / 100000000

# Total USD received per destination address (`outaddr`).
abvo <- data.frame(tapply(txo$outusd, txo$outaddr, sum, na.rm = TRUE))
names(abvo) <- "drainusd"
abvo$outaddr <- row.names(abvo)

# Total USD sent per source address (`addr`).
abbr <- data.frame(tapply(txo$outusd, txo$addr, sum, na.rm = TRUE))
names(abbr) <- "outusd"
abbr$addr <- rownames(abbr)

# Left join so wallets without any outgoing transaction are retained.
bmnr <- merge(
  bmnr,
  abbr,
  by.x = "address",
  by.y = "addr",
  all.x = TRUE
)

# Rank-order plot of the cumulative share of a total.
#
# Sorts `vec` from largest to smallest (sort() discards NA values) and plots,
# for every rank k, the percentage of sum(vec) accounted for by the k largest
# entries.
#
# Args:
#   vec: numeric vector of values to rank.
#   ...: further arguments forwarded unchanged to plot() (e.g. log, type,
#        xlab, ylab, main).
#
# Returns:
#   Whatever plot() returns; called for its side effect of drawing on the
#   current graphics device.
RankOrderPlot <- function(vec, ...) {
  vec <- sort(vec, decreasing = TRUE)
  # cumsum() yields every running total in a single pass instead of growing a
  # vector and re-summing a prefix per element; seq_along() avoids the 1:0
  # trap that 1:length(vec) hits when nothing is left after sorting.
  share <- cumsum(vec) / sum(vec)
  plot(x = seq_along(vec), y = 100 * share, ...)
}

# Two-panel figure written to fig/tacdfusd.pdf:
#   left  - empirical CDF of `outusd` per wallet (log-scaled x axis),
#   right - rank-order plot of the same values.
tde <- ecdf(bmnr$outusd)
pdf("fig/tacdfusd.pdf", height = 4, width = 8)
par(mfrow = c(1, 2))
par(mar = c(4, 4, 2.5, 1))
# The step positions and heights are read directly from the ecdf closure.
plot(
  x = environment(tde)$x,
  y = environment(tde)$y,
  col = "black",
  lwd = 3,
  type = "s",
  xlab = "Total USD in Wallet",
  ylab = "P(total USD) <= x",
  log = "x",
  main = "CDF: Wallet Value (USD)",
  cex = 1.35,
  cex.axis = 1.2,
  cex.lab = 1.2
)
RankOrderPlot(
  bmnr$outusd,
  log = "x",
  lty = "solid",
  type = "s",
  cex = 2.5,
  ylim = c(0, 100),
  xlab = "# Largest Wallets",
  ylab = "% Total USD",
  main = "Rank-Order: Total Value (USD)",
  lwd = 3,
  cex.axis = 1.2,
  cex.lab = 1.2
)
dev.off()


# Count, per destination address, how many brain-wallet rows drain to it.
# NOTE(review): the count is the number of merged (wallet, outgoing
# transaction) rows, not the number of distinct wallets -- confirm this is
# the intended definition of "numdrained".
txob <- merge(bmnr, txo, by.x = "address", by.y = "addr")
bo <- txob[, c("address", "outaddr")]

bos <- sort(
  tapply(bo$address, bo$outaddr, length),
  decreasing = TRUE
)
bos <- data.frame(bos)
bos$drainaddr <- row.names(bos)
names(bos) <- c("numdrained", "drainaddr")

# Add the total USD each destination received, then keep destinations with at
# least 5 rows and at least 100 USD.
bos <- merge(bos, abvo, by.x = "drainaddr", by.y = "outaddr")
drainers <- bos$drainaddr[bos$numdrained >= 5 & bos$drainusd >= 100]

# Total `outusd` per first-seen month; months with no wallets become 0.
usdt <- data.frame(tapply(bmnr$outusd, bmnr$firstm, sum, na.rm = TRUE))
names(usdt) <- "usdt"
usdt$usdt[is.na(usdt$usdt)] <- 0

# Incoming transactions; the file has no header row, so columns are named here.
bwin <- read.csv("transin.csv", sep = ",", header = FALSE)
names(bwin) <- c("address", "txid", "inamt", "intime")

# Day and month bucket of each incoming transaction.
bwin$inday <- as.Date(bwin$intime)
bwin$inm <- cut(bwin$inday, breaks = "month")

# Attach that day's exchange rate and convert the amount to USD.
bwin <- merge(
  bwin,
  bsx,
  by.x = "inday",
  by.y = "tradeday",
  all.x = TRUE
)
bwin$inusd <- bwin$btcusd * bwin$inamt / 100000000

# Number of wallets over time: stacked bar chart (compressed / uncompressed)
# with the monthly USD value of incoming transactions overlaid as a line on a
# right-hand axis. Written to fig/bwtime.pdf.

# Bar labels: every other month from 2011-07 to 2017-01 is labelled and the
# months in between are blank (67 entries, matching the 67 monthly bars).
monames <- format(
  seq(as.Date("2011-07-01"), as.Date("2017-01-01"), by = "month"),
  "%Y-%m"
)
monames[seq(2, length(monames), by = 2)] <- ""

# Long-format copy of the same counts (one row per compressed/month pair).
tb <- data.frame(table(bmnr$compressed, bmnr$firstm))
names(tb) <- c("compressed", "firstm", "freq")
tb$firstm <- as.Date(tb$firstm)

pdf("fig/bwtime.pdf", height = 6.5, width = 10)
par(mar = c(6, 4, 0.5, 4))
# NOTE(review): legend.text labels the table rows in their sorted order; if
# `compressed` is logical the first row is FALSE -- confirm the labels are
# not swapped.
bp <- barplot(
  table(bmnr$compressed, bmnr$firstm),
  ylab = "# wallets",
  las = 2,
  legend.text = c("compressed", "uncompressed"),
  names.arg = monames,
  cex.names = 1.3,
  cex.axis = 1.3,
  cex.lab = 1.3
)

# Overlay the monthly incoming USD on the bar midpoints returned by barplot().
# Only as many months as there are bars are used (the original hard-coded the
# first 67, "cutting off February...").
# NOTE(review): assumes the month buckets of bwin$inm start at the same month
# as those of bmnr$firstm -- confirm.
par(new = TRUE)
plot(
  bp,
  tapply(bwin$inusd, bwin$inm, sum, na.rm = TRUE)[seq_along(bp)],
  col = "blue",
  type = "l",
  lwd = 2,
  axes = FALSE,
  ylim = c(0, 45000),
  xlab = "",
  ylab = ""
)
# axis() sizes its tick labels from cex.axis; a plain `cex` is ignored there,
# so use cex.axis to match the left-hand axis.
axis(4, at = seq(0, 45000, 5000), cex.axis = 1.3)
mtext("USD", side = 4, line = 2.5)
legend("topleft", "New wallet value (USD)", lwd = 2, col = "blue")
dev.off()

# Total incoming USD per wallet address, left-joined onto the wallet table.
bwusd <- data.frame(tapply(bwin$inusd, bwin$address, sum, na.rm = TRUE))
names(bwusd) <- "totalinusd"
bwusd$address <- row.names(bwusd)

bmnr <- merge(bmnr, bwusd, by = "address", all.x = TRUE)

# Empirical CDFs of firstTTD converted to hours (divided by 3600): all
# wallets, wallets with total incoming USD <= 0.01, and those with <= 0.001.
tvale <- ecdf(bmnr$firstTTD / 3600)
tvale1 <- ecdf(bmnr$firstTTD[bmnr$totalinusd <= 0.01] / 3600)
tvale0 <- ecdf(bmnr$firstTTD[bmnr$totalinusd <= 0.001] / 3600)


pdf("fig/ttdvalcdf.pdf", height = 5.7, width = 5)
par(mar = c(4, 4, 3, 0.5))
plot(
  x = environment(tvale)$x,
  y = environment(tvale)$y,
  col = 1,
  lwd = 3,
  type = "s",
  xlab = "Hours to Drain",
  ylab = "P(TTD) <= x hours",
  cex = 1.6,
  cex.axis = 1.3,
  cex.lab = 1.3,
  main = "CDF: Time-to-Drain by Wallet Value"
)
lines(
  x = environment(tvale0)$x,
  y = environment(tvale0)$y,
  col = 4,
  lwd = 3,
  type = "s",
  lty = 3
)
lines(
  x = environment(tvale1)$x,
  y = environment(tvale1)$y,
  col = 2,
  lwd = 3,
  type = "s",
  lty = 2
)

# Legend entries are ordered to match line type / colour: 1 = overall,
# 2 = the 0.01 threshold, 3 (colour 4) = the 0.001 threshold.
legend(
  "bottomright",
  lwd = 3,
  lty = 1:3,
  col = c(1, 2, 4),
  legend = c("Overall", "<$0.01", "<$0.001"),
  cex = 1.3
)
dev.off()

# Time-to-drain records joined (inner join) with the wallet master table.
ttdf <- read.csv(
  "timetodrain.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("NA", "-")
)
t2 <- merge(ttdf, bmp, by.x = "Addr", by.y = "address")


# Day of the incoming transaction plus quarter / month / year buckets.
t2$inday <- as.Date(t2$timein)
t2$inq <- cut(t2$inday, breaks = "quarter")
t2$inm <- cut(t2$inday, breaks = "month")
t2$iny <- cut(t2$inday, breaks = "year")

# Keep only rows whose `researcher` flag is FALSE.
t2nr <- t2[t2$researcher == FALSE, ]

# Bar labels: 68 entries starting at 2011-07; every other month is labelled
# (last label 2017-01) and the final entry is blank.
monames2 <- format(
  seq(as.Date("2011-07-01"), as.Date("2017-02-01"), by = "month"),
  "%Y-%m"
)
monames2[seq(2, length(monames2), by = 2)] <- ""

# Median hours to drain per month of the incoming transaction.
pdf("fig/ttdtime.pdf", height = 5, width = 10)
par(mar = c(6, 4, 0.5, 0.5))
bp <- barplot(
  tapply(t2nr$secstodrain / 3600, t2nr$inm, median, na.rm = TRUE),
  ylab = "median # hours to drain",
  las = 2,
  names.arg = monames2,
  cex.names = 1.3,
  cex.axis = 1.3,
  cex.lab = 1.4
)
dev.off()

# Does password strength (Wheeler's zxcvbn score) matter?
# Spearman rank correlations between the strength score and several wallet
# attributes.
strength <- read.csv("pwdstrength.csv", header = FALSE)
names(strength) <- c("address", "strength")

# Left join that keeps every wallet. The argument must be spelled `all.x`:
# the previous `all.X` was swallowed by merge()'s `...` and silently ignored.
# cor.test() drops incomplete pairs, so wallets without a score do not affect
# the tests below.
sbmnr <- merge(bmnr, strength, all.x = TRUE)

# firstseen is a timestamp read from CSV (lastseen presumably too -- confirm).
# Since R 4.0 read.csv() keeps such columns as character, and as.numeric() on
# them yields only NA. factor() assigns codes in sorted order, which gives the
# ranks Spearman needs whether the column is character or factor.
# NOTE(review): this relies on the timestamps sorting chronologically as text
# (true for zero-padded ISO-8601 strings) -- confirm the file format.
cor.test(
  sbmnr$strength,
  as.numeric(factor(sbmnr$firstseen)),
  method = "spearman"
)
cor.test(sbmnr$strength, sbmnr$totalamt, method = "spearman")
cor.test(sbmnr$strength, sbmnr$firstTTD, method = "spearman")
cor.test(
  sbmnr$strength,
  as.numeric(factor(sbmnr$lastseen)),
  method = "spearman"
)
cor.test(sbmnr$strength, sbmnr$endbalance, method = "spearman")
cor.test(sbmnr$strength, sbmnr$numdrains, method = "spearman")

# Source of the passwords: each source list is classified as containing
# words, phrases, or a mix of both.
sourcenames <- c(
  "bitsig", "brainy", "brute", "combquotes", "dumps", "facebook",
  "industry", "irc", "keyboard", "lyrics", "misc", "openwall",
  "purdue", "reddit", "urbandict", "wikipedia", "wikiquote", "xkcd"
)
pwdlists <- data.frame(
  list = sourcenames,
  type = c(
    "mixed", "phrase", "word", "phrase", "word", "word",
    "word", "phrase", "word", "phrase", "mixed", "word",
    "word", "phrase", "word", "phrase", "phrase", "phrase"
  )
)

# One entry per address.
att <- read.csv("attacksrc.csv", header = TRUE, sep = ",")
att2 <- merge(bmnr, att, by.x = "address", by.y = "Addr")

# One entry per (address, list) entity.
att3 <- read.csv("attacksrc2.csv", header = TRUE, sep = ",")
att4 <- merge(bmnr, att3, by.x = "address", by.y = "password")

# Attach the list type; merge() joins on whatever column names the two tables
# share (presumably just `list` -- confirm against attacksrc2.csv).
att5 <- merge(att4, pwdlists)

# Empirical CDFs of firstTTD in hours: overall and per list type.
ttdecdf <- ecdf(att5$firstTTD / 3600)
ttdecdf2 <- ecdf(att5$firstTTD[att5$type == "phrase"] / 3600)
ttdecdf3 <- ecdf(att5$firstTTD[att5$type == "word"] / 3600)
ttdecdf4 <- ecdf(att5$firstTTD[att5$type == "mixed"] / 3600)

pdf("fig/ttdbylist.pdf", height = 5, width = 5)
plot(
  x = environment(ttdecdf)$x,
  y = environment(ttdecdf)$y,
  col = "black",
  lwd = 2,
  type = "s",
  xlab = "Hours to Drain",
  ylab = "P(TTD) <= x hours"
)
lines(
  x = environment(ttdecdf2)$x,
  y = environment(ttdecdf2)$y,
  col = 2,
  lwd = 2,
  type = "s"
)
lines(
  x = environment(ttdecdf3)$x,
  y = environment(ttdecdf3)$y,
  col = 3,
  lwd = 2,
  type = "s"
)
lines(
  x = environment(ttdecdf4)$x,
  y = environment(ttdecdf4)$y,
  col = 4,
  lwd = 2,
  type = "s"
)
legend(
  "bottomright",
  c("all", "passphrase", "password", "mixed"),
  col = 1:4,
  pch = 16
)
dev.off()

# Empirical CDFs of firstTTD in hours, split by password length bucket:
#   ttdecdf  - all wallets
#   ttdecdf2 - length > 20
#   ttdecdf3 - length <= 5
#   ttdecdf4 - length 6-10
#   ttdecdf5 - length 11-20
ttdecdf <- ecdf(bmnr$firstTTD / 3600)
ttdecdf2 <- ecdf(bmnr$firstTTD[bmnr$pwdlength > 20] / 3600)
ttdecdf3 <- ecdf(bmnr$firstTTD[bmnr$pwdlength <= 5] / 3600)
ttdecdf4 <- ecdf(
  bmnr$firstTTD[bmnr$pwdlength <= 10 & bmnr$pwdlength > 5] / 3600
)
# Column name spelled out in full: the previous `pwdlengt` only resolved
# through `$` partial matching and would break silently if another column
# with that prefix were added.
ttdecdf5 <- ecdf(
  bmnr$firstTTD[bmnr$pwdlength > 10 & bmnr$pwdlength <= 20] / 3600
)


pdf("fig/ttdbylength.pdf", height = 5, width = 5)
plot(
  x = environment(ttdecdf)$x,
  y = environment(ttdecdf)$y,
  col = "black",
  lwd = 2,
  type = "s",
  xlab = "Hours to Drain",
  ylab = "P(TTD) <= x hours"
)
lines(
  x = environment(ttdecdf2)$x,
  y = environment(ttdecdf2)$y,
  col = 2,
  lwd = 2,
  type = "s"
)
lines(
  x = environment(ttdecdf3)$x,
  y = environment(ttdecdf3)$y,
  col = 3,
  lwd = 2,
  type = "s"
)
lines(
  x = environment(ttdecdf4)$x,
  y = environment(ttdecdf4)$y,
  col = 4,
  lwd = 2,
  type = "s"
)
lines(
  x = environment(ttdecdf5)$x,
  y = environment(ttdecdf5)$y,
  col = 5,
  lwd = 2,
  type = "s"
)
# Legend colours follow the bucket order shown, not the drawing order above.
legend(
  "bottomright",
  c("all", "0-5", "6-10", "11-20", "20+"),
  col = c(1, 3, 4, 5, 2),
  pch = 16
)
dev.off()


# Drains by mining pools: annotate each transaction with its block and pool,
# and with its USD value.
nzt <- read.csv("nonzerotrans.csv", header = FALSE)
names(nzt) <- c("address", "trans", "amount", "date")
nzt$date <- as.Date(nzt$date)

# Transaction -> block mapping; left join on the shared `trans` column.
blocks <- read.csv("drained_by_none_transactions.txt.bal", header = FALSE)
names(blocks) <- c("trans", "block")
nzt <- merge(nzt, blocks, all.x = TRUE)

# Block -> pool mapping; left join on the shared `block` column.
pools <- read.csv("block-to-pool.txt", header = FALSE)
names(pools) <- c("block", "pool")
nzt <- merge(nzt, pools, all.x = TRUE)

# Attach the exchange rate of the transaction day and convert to USD.
nzt <- merge(
  nzt,
  bsx,
  by.x = "date",
  by.y = "tradeday",
  all.x = TRUE
)
nzt$totalusd <- nzt$btcusd * nzt$amount / 100000000


